
from typing import Optional

import nltk
from nltk import word_tokenize

from dataset import SENTENCES

def demo(sentence: Optional[list] = None):
    """Tokenize and POS-tag each string in ``sentence`` with NLTK.

    Args:
        sentence: List of input strings. Defaults to ``["词性标注"]``
            when not given.

    Returns:
        A ``(ret_seg, ret_tag)`` tuple of parallel lists: for each input
        string, ``ret_seg`` holds its list of tokens and ``ret_tag`` the
        corresponding list of POS tags.
    """
    # None sentinel instead of a mutable default list argument, which
    # would be shared across calls.
    if sentence is None:
        sentence = ["词性标注"]
    ret_seg = []
    ret_tag = []
    for s in sentence:
        tagged = nltk.pos_tag(word_tokenize(s))
        ret_seg.append([w for w, _ in tagged])
        ret_tag.append([t for _, t in tagged])
    return ret_seg, ret_tag


'''
一些特性
- 不支持中文分词以及中文词性标注
- 词性标注使用宾州（Penn Treebank）词性标注集

'''

if __name__ == "__main__":
    words, tags = demo(SENTENCES)
    # Print each original sentence followed by its "/"-joined tokens
    # and POS tags; zip the parallel lists instead of indexing by i.
    for sent, ws, ts in zip(SENTENCES, words, tags):
        print(sent)
        print("/".join(ws))
        print("/".join(ts))
        print("")

